********************************************************************************
********************************************************************************
****************** THIS IS DO FILE 0, GERERATING THE DATA **********************
********************************************************************************
********************************************************************************

*Version: 2023 April 06


* make output directory 
cap mkdir `"$output"'
cap mkdir `"$output/data"'
cap mkdir `"$output/log"'
cap mkdir `"$output/log/data"'
cap mkdir `"$output/log/data/0_data_processing"'

*log file
cap log close 
log using `"$output/log/data/0_data_processing.smcl"', replace 

di "This run uses code version from 2023/04/06"

	
local date `"$date"'



**********************************************************************************************************
** STACKS THE RAW DATA ACROSS 2013-2015, ensures unique visit ID and decodes dx codes, merges zipcode ****
**********************************************************************************************************
forvalues yy = 13/15 {
	use `"$raw_input/${raw_data_20`yy'}"', clear
	tostring visit_billing_id, gen(visit_billing_id_s)
	drop visit_billing_id
	rename visit_billing_id_s visit_billing_id

	merge m:1 idw_provider_id using `"$raw_input/$zip_code_merge"', keepusing(zipcode)
	drop if _merge == 2
	drop _merge 
	replace visit_billing_id = visit_billing_id + "`yy'"
	forval i = 1/3 {
		decode dx`i'_code, gen(dx`i'_code_s)
		drop dx`i'_code
		rename dx`i'_code_s dx`i'_code
	}
	compress
	save dta20`yy', replace
	clear
}

use dta2013.dta
append using dta2014.dta, force
append using dta2015.dta, force

forv yy = 13/15 {
	erase dta20`yy'.dta 
}



**************************************************************************
**************************************************************************
********************Pre-process and clean the raw dataset*****************
**************************************************************************
**************************************************************************

***** fix state identifiers and unique state within visit 
cap drop state payee_state_cd
gen fixed_state = state_cd

replace fixed_state = "AL" if regexm(bobfixed_payer_name_broad, "ALABAMA") == 1 
replace fixed_state = "AK" if regexm(bobfixed_payer_name_broad, "ALASKA") == 1 
replace fixed_state = "AZ" if regexm(bobfixed_payer_name_broad, "ARIZONA") == 1 
replace fixed_state = "AR" if regexm(bobfixed_payer_name_broad, "ARKANSAS") == 1 
replace fixed_state = "CA" if regexm(bobfixed_payer_name_broad, "CALIFORNIA") == 1 
replace fixed_state = "CO" if regexm(bobfixed_payer_name_broad, "COLORADO") == 1 
replace fixed_state = "CT" if regexm(bobfixed_payer_name_broad, "CONNECTICUT") == 1 
replace fixed_state = "DE" if regexm(bobfixed_payer_name_broad, "DELAWARE") == 1 
replace fixed_state = "FL" if regexm(bobfixed_payer_name_broad, "FLORIDA") == 1 
replace fixed_state = "GA" if regexm(bobfixed_payer_name_broad, "GEORGIA") == 1 
replace fixed_state = "HI" if regexm(bobfixed_payer_name_broad, "HAWAII") == 1 
replace fixed_state = "ID" if regexm(bobfixed_payer_name_broad, "IDAHO") == 1 
replace fixed_state = "IL" if regexm(bobfixed_payer_name_broad, "ILLINOIS") == 1 
replace fixed_state = "IN" if regexm(bobfixed_payer_name_broad, "INDIANA") == 1 
replace fixed_state = "IA" if regexm(bobfixed_payer_name_broad, "IOWA") == 1 
replace fixed_state = "KS" if regexm(bobfixed_payer_name_broad, "KANSAS") == 1 & regexm(bobfixed_payer_name_broad, "ARKANSAS") == 0
replace fixed_state = "KY" if regexm(bobfixed_payer_name_broad, "KENTUCKY") == 1
replace fixed_state = "LA" if regexm(bobfixed_payer_name_broad, "LOUISIANA") == 1 
replace fixed_state = "ME" if regexm(bobfixed_payer_name_broad, "MAINE") == 1 
replace fixed_state = "MD" if regexm(bobfixed_payer_name_broad, "MARYLAND") == 1 
replace fixed_state = "MA" if regexm(bobfixed_payer_name_broad, "MASSACHUSETTS") == 1 
replace fixed_state = "MI" if regexm(bobfixed_payer_name_broad, "MICHIGAN") == 1 
replace fixed_state = "MN" if regexm(bobfixed_payer_name_broad, "MINNESOTA") == 1 
replace fixed_state = "MS" if regexm(bobfixed_payer_name_broad, "MISSISSIPPI") == 1
replace fixed_state = "MO" if regexm(bobfixed_payer_name_broad, "MISSOURI") == 1 
replace fixed_state = "MT" if regexm(bobfixed_payer_name_broad, "MONTANA") == 1 
replace fixed_state = "NE" if regexm(bobfixed_payer_name_broad, "NEBRASKA") == 1 
replace fixed_state = "NV" if regexm(bobfixed_payer_name_broad, "NEVADA") == 1 
replace fixed_state = "NH" if regexm(bobfixed_payer_name_broad, "NEW HAMPSHIRE") == 1 
replace fixed_state = "NJ" if regexm(bobfixed_payer_name_broad, "NEW JERSEY") == 1 
replace fixed_state = "NM" if regexm(bobfixed_payer_name_broad, "NEW MEXICO") == 1 
replace fixed_state = "NY" if regexm(bobfixed_payer_name_broad, "NEW YORK") == 1 
replace fixed_state = "NC" if regexm(bobfixed_payer_name_broad, "NORTH CAROLINA") == 1 
replace fixed_state = "ND" if regexm(bobfixed_payer_name_broad, "NORTH DAKOTA") == 1 
replace fixed_state = "OH" if regexm(bobfixed_payer_name_broad, "OHIO") == 1 
replace fixed_state = "OK" if regexm(bobfixed_payer_name_broad, "OKLAHOMA") == 1 
replace fixed_state = "OR" if regexm(bobfixed_payer_name_broad, "OREGON") == 1 
replace fixed_state = "PA" if regexm(bobfixed_payer_name_broad, "PENNSYLVANIA") == 1 
replace fixed_state = "RI" if regexm(bobfixed_payer_name_broad, "RHODE ISLAND") == 1 
replace fixed_state = "SC" if regexm(bobfixed_payer_name_broad, "SOUTH CAROLINA") == 1 
replace fixed_state = "SD" if regexm(bobfixed_payer_name_broad, "SOUTH DAKOTA") == 1 
replace fixed_state = "TN" if regexm(bobfixed_payer_name_broad, "TENNESSEE") == 1 
replace fixed_state = "TX" if regexm(bobfixed_payer_name_broad, "TEXAS") == 1 
replace fixed_state = "UT" if regexm(bobfixed_payer_name_broad, "UTAH") == 1 
replace fixed_state = "VT" if regexm(bobfixed_payer_name_broad, "VERMONT") == 1
replace fixed_state = "VA" if regexm(bobfixed_payer_name_broad, "VIRGINIA") == 1 
replace fixed_state = "WA" if regexm(bobfixed_payer_name_broad, "WASHINGTON") == 1 
replace fixed_state = "WV" if regexm(bobfixed_payer_name_broad, "WEST VIRGINIA") == 1 
replace fixed_state = "WI" if regexm(bobfixed_payer_name_broad, "WISCONSIN") == 1 
replace fixed_state = "WY" if regexm(bobfixed_payer_name_broad, "WYOMING") == 1 


gegen state =group(fixed_state)
bys visit_billing_id: gegen sd_state = sd(state)
replace sd_state = 0 if sd_state == . 
assert sd_state == 0 
gen payee_state_cd = fixed_state
drop state sd_state fixed_state
cap drop state_cd 
gen clean_state_id = 0
foreach valid in `"AK"' `"AL"' `"AR"' `"AZ"' `"CA"' `"CO"' `"CT"' `"DC"' `"DE"' `"FL"' `"GA"' `"HI"' ///
 `"IA"' `"ID"' `"IL"' `"IN"' `"KS"' `"KY"' `"LA"' `"MA"' `"MD"' `"ME"' `"MI"' `"MN"' `"MO"' `"MS"' `"MT"' `"NC"' `"ND"' `"NE"' ///
 `"NH"' `"NJ"' `"NM"' `"NV"' `"NY"' `"OH"' `"OK"' `"OR"' `"PA"' `"RI"' `"SC"' ///
 `"SD"' `"TN"' `"TX"' `"UT"' `"VA"' `"VT"' `"WA"' `"WI"' `"WV"' `"WY"' {
	
	replace clean_state_id = 1 if payee_state_cd=="`valid'"

}

tab payee_state_cd if clean_state_id==0
keep if clean_state_id ==1

gegen state =group(payee_state_cd)

gegen state_nr = max(state)
assert state_nr == 51
drop state_nr


**************************
**** sample selection ****
**************************

drop if PartD == 1
drop if MOther == 1
drop if medicaidMCOFFS == 1
drop if military == 1
drop if missing(broad_pay_type)
drop if missing(state)


**********Switch to Back and Forth for Claim Control Number***
drop back_and_forth
rename  back_and_forth_cntl_nbr back_and_forth 


******CORRECT MEDICARE ADVANTAGE***********************
gen      MCO65    = patient_age >= 65 & third == 1
replace  third    = 0 if MCO65 == 1
replace  medicare = 1 if MCO65 == 1
replace  MA_tot   = 1 if MCO65 == 1
*******************************************************

******Create MEDIGAP & DUAL ELIGIBLES***********************
bysort  visit_billing_id: egen medigap_payer  = max(medigap)
bysort  visit_billing_id: egen dual_payer     = max(medicaiddual)
replace third  = 0      if medigap_payer == 1 | dual_payer == 1
replace MA_tot = 0      if medigap_payer == 1 | dual_payer == 1
replace medicaidFFS = 0 if medigap_payer == 1 | dual_payer == 1
replace medicaidMCO = 0 if medigap_payer == 1 | dual_payer == 1
replace MFFS   = 0      if medigap_payer == 1 | dual_payer == 1
*********************************************************
    
***Fix Patient Age 
replace patient_age = . if patient_age < 0

***FIX BACK AND FORTH*****
drop if back_and_forth > 1000
replace back_and_forth = . if back_and_forth > 100

****Drop UNITED OUTLIER***
drop if amtlost_visit > 500000 & UNITED == 1 & ~missing(amtlost_visit) 

***Drop Claim Amount Outlier:
bysort  visit_billing_id: egen maxclaim = max(amt_allowed)
sum amt_allowed, d
di "total spending is " r(sum)

sum amt_allowed if maxclaim > 1000000, d 
di "total spending is " r(sum)

drop if maxclaim > 1000000

***FIX SHARE LOST****
replace sharelost_visit = . if sharelost_visit > 1

gen mr = medicare
gen md = medicaid
gen th = third
gen mco = medicaidMCO 
gen dfs = medicaidFFS
gen mfs = MFFS
gen ma  = MA_tot 
gen tot = 1
gen gap = medigap_payer
gen dual =  dual_payer

gen     pay_type = 1
replace pay_type = 2 if dfs == 1
replace pay_type = 3 if mco == 1
replace pay_type = 4 if ma == 1
replace pay_type = 5 if th == 1
replace pay_type = 6 if gap == 1
replace pay_type = 7 if dual == 1

lab var pay_type "1: Mcare FFS; 2: Mcaid FFS; 3: Mcaid MCO; 4: MA; 5: Commercial"

gen pay_type_pooled = 1 if pay_type == 1 | pay_type == 4
replace pay_type_pooled = 2 if pay_type == 2 | pay_type == 3
replace pay_type_pooled = 3 if pay_type == 5

drop flag_denied 
gen flag_denied = claim_status_cd == 4 //claim status code == 4 is denied 

*Drop bundled service lines (These show up as not being paid, but will show up in the allowed amount)
forval i = 1/5 {
	drop if rcsa_remit_adj_cd_`i' == "97" | rcsa_remit_adj_cd_`i' == "234"
}

****drop secondary claims***
drop if  claim_status_cd==2 | claim_status_cd==3 | claim_status_cd==20 | claim_status_cd==23  | claim_status_cd==25 | gap == 1 | dual == 1

// dropping dual and medigap payers 
drop if pay_type == 7 | pay_type == 6 


// drop reversals 
bysort       claim_key: egen rev_claim = max(claim_status_cd == 22)
bysort     visit_billing_id: egen rev_visit = max(claim_status_cd == 22)
drop if rev_visit == 1  //this drops around 31,000,000  observations
bys visit_billing_id: gegen min_allowed = min(allwd_amt) 
drop if min_allowed < 0 
drop min_allowed



**************************************************
*********Prepare for Imputations******************
**************************************************

// some definitions necessary for the imputations 

*strictest condition when a claim can be considered "no problem"
gen contractline = inlist(rcsa_remit_adj_cd_1, "45")
forval i = 2/5 {
	replace contractline = 1 if rcsa_remit_adj_cd_`i' == "45"
	}
	
*consider some more codes to be "no problem"
forval i = 1/5 {
gen fineline_`i' = inlist(rcsa_remit_adj_cd_`i',"","45","1","2","3","253","223","23")
}

egen allfine=rsum(fineline_1-fineline_5)
cap drop no_prob_observation
gen no_prob_observation=allfine==5


// set patient amount to zero if patient amount is missing 
replace pat_resp_amt = 0 if missing(pat_resp_amt)

// set allowed amt to missing if equal to zero 
replace allwd_amt = . if allwd_amt == 0

// we use the paid amount as allowed amount in case of missing allowed amount, no problem observations and medicaid (zero copay)
replace allwd_amt = amt_paid_line if allwd_amt == . & pay_type_pooled == 2  & ~missing(amt_paid_line) & no_prob_observation

// drop visits with nagative payment negative payments 
gen neg_payment_obsrvd = 0 
replace  neg_payment_obsrvd = 1 if allwd_amt < 0 
bys visit_billing_id: gegen max_neg_payment = max(neg_payment_obsrvd)
drop if max_neg_payment > 0
drop neg_payment_obsrvd max_neg_payment

// drop bottom and top percent of CPT, payer, year combinations 
bys procedure_cd year pay_type_pooled: gegen  max_cutoff = pctile(allwd_amt) , p(99)
bys procedure_cd year pay_type_pooled: gegen  min_cutoff = pctile(allwd_amt) , p(1)

sum allwd_amt, d 
di "total spending is " r(sum)

sum allwd_amt if allwd_amt > max_cutoff & max_cutoff != . & allwd_amt != ., d 
di "total spending of top 1% is " r(sum)

sum allwd_amt if allwd_amt < min_cutoff  & min_cutoff != . & allwd_amt != ., d
di "total spending of bottom 1% is " r(sum)

drop if allwd_amt > max_cutoff & max_cutoff != . & allwd_amt != .
drop if allwd_amt < min_cutoff  & min_cutoff != . & allwd_amt != .

****************************************************************
****************************************************************
*********Compute Imputed allowed Amount Values******************
****************************************************************
****************************************************************
	
*Imputation based on markup of billed amount by provider and payer
gen     allwd_amtTEMP = allwd_amt
replace allwd_amtTEMP = . if allwd_amt == amt_billed_line & amt_paid_line == 0 //Many instances with a nonpayment but alloweded amt filled in as billed
replace allwd_amtTEMP = . if allwd_amt == .01 | allwd_amt == -.01
bysort idw_provider_id idw_payer_id: egen contract_pp = total(contractline*allwd_amtTEMP*(amt_billed_line>0)*(allwd_amtTEMP>0)*(allwd_amtTEMP!=.)*(amt_billed_line!=.)), missing
by     idw_provider_id idw_payer_id: egen billed_pp   = total(contractline*amt_billed_line*(amt_billed_line>0)*(allwd_amtTEMP>0)*(allwd_amtTEMP!=.)*(amt_billed_line!=.)), missing
gen discount_pp = contract_pp/billed_pp
gen imputation_billing = discount_pp*amt_billed_line
label var imputation_billing "Imputed Allowed based on Billed Amt. by provider and payer"


*Imputation based on CPT code of other items from same payer/payee pair 
bysort idw_provider_id idw_payer_id procedure_cd: egen num = total(contractline*allwd_amt*(amt_billed_line>0)*(allwd_amtTEMP>0)*(allwd_amtTEMP!=.)*(amt_billed_line!=.)), missing
bysort idw_provider_id idw_payer_id procedure_cd: egen dem = total(contractline*(amt_billed_line>0)*(allwd_amtTEMP>0)*(allwd_amtTEMP!=.)*(amt_billed_line!=.)), missing
gen imputation_cpt = num/dem
label var imputation_cpt "Imputed Allowed Amt. by provider procedure codes"



*Imnputation based on Medicare 
gen goodobs=contractline*(allwd_amtTEMP>0)*(allwd_amtTEMP!=.)
sort   procedure_cd place_of_svc_cd medicare medicaid third goodobs
by   procedure_cd place_of_svc_cd medicare medicaid third goodobs: egen mean_allwd_amtADJ=median((allwd_amtTEMP))
by   procedure_cd place_of_svc_cd: egen meanallowedMFFS=sum(MFFS*mean_allwd_amtADJ*goodobs)
by   procedure_cd place_of_svc_cd: egen tot1=sum(MFFS*goodobs)
replace meanallowedMFFS=meanallowedMFFS/tot1
by   procedure_cd: egen altmeanallowedMFFS=sum(MFFS*mean_allwd_amtADJ*goodobs)
by   procedure_cd: egen totobs=sum(MFFS*goodobs)
replace altmeanallowedMFFS=altmeanallowedMFFS/totobs

replace meanallowedMFFS=altmeanallowedMFFS if meanallowedMFFS==0

drop num dem
bysort idw_provider_id idw_payer_id: egen num = total(contractline*allwd_amtTEMP*(meanallowedMFFS>0)*(allwd_amtTEMP>0)*(allwd_amtTEMP!=.)*(meanallowedMFFS!=.)), missing
bysort idw_provider_id idw_payer_id: egen dem = total(contractline*meanallowedMFFS*(meanallowedMFFS>0)*(allwd_amtTEMP>0)*(allwd_amtTEMP!=.)*(meanallowedMFFS!=.)), missing
gen ratioALT7 = num/dem
summ ratioALT7 if medicaidFFS | medicaidMCO, detail
display r(p5)
display r(p95)
gen outlier=(ratioALT7<r(p5)) | (ratioALT7>r(p95)) & ~missing(ratioALT7) & (medicaidFFS  | medicaidMCO)

summ ratioALT7 if MFFS | MA, detail
display r(p5)
display r(p95)
replace outlier=(ratioALT7<r(p5)) | (ratioALT7>r(p95)) & ~missing(ratioALT7) & (MFFS | MA)

summ ratioALT7 if third, detail
display r(p5)
display r(p95)
replace outlier=(ratioALT7<r(p5)) | (ratioALT7>r(p95)) & ~missing(ratioALT7) & (third)

gen imputation_medicare = ratioALT7*meanallowedMFFS
replace  imputation_medicare=. if meanallowedMFFS==0
replace imputation_medicare = . if imputation_medicare>abs(amt_billed_line)

label var imputation_medicare "Imputed Al. Amt. by % dif. from nat. MFFS (imputation < billed amt)"

drop ratio*

************************************************************************************
************************************************************************************
****** Create Payment Identicator, define line item value and drop outlier *********
************************************************************************************
************************************************************************************

// Create Payment indicator 
gen Y_X = 0								// create variable to indicate payment 
replace Y_X = 1 if (amt_paid_line > 0 & amt_paid_line != .) | (pat_resp_amt > 0 & amt_paid_line != .)	// indicate whether an item has been paid (this treats every item and resubmission separatly)


// Define line item value 
gen line_item_value 	= allwd_amt         if Y_X==1
replace line_item_value = imputation_cpt if Y_X==0 & line_item_value == . 
replace line_item_value = imputation_medicare if Y_X==0 & line_item_value == .
replace line_item_value = imputation_billing if Y_X==0 & line_item_value == .


// drop outlier: drop all visits with one line item > the 1% cutoff values defined before 
gen outlier_imputation = 0
replace outlier_imputation = 1 if line_item_value > max_cutoff & max_cutoff != . & line_item_value != .
replace outlier_imputation = 1 if line_item_value < min_cutoff & min_cutoff != . & line_item_value != .
replace outlier_imputation = 1 if line_item_value < 0.01 & line_item_value != .
bys visit_billing_id : gegen outlier_visit = max(outlier_imputation)
drop if outlier_visit > 0
bys visit_billing_id: gegen min_pi = min(line_item_value)
drop if min_pi < 0
drop min_pi outlier_visit outlier_imputation

gen missing = 0 
replace missing = 1 if line_item_value == . 
bys visit_billing_id: gegen max_missing = max(missing)
sum max_missing, d 
sum max_missing if max_missing > 0 
drop if max_missing == 1 
drop max_missing


**flag imputed claims**
gen imputed = 0 
replace imputed = 1 if Y_X==0 
bysort  claim_key:  egen imputed_claim  = max(imputed)
bysort visit_billing_id: egen imputed_visit  = max(imputed_claim == 1)


compress


*******************************************************
*******************************************************
********* Further, minor cleaning steps  **************
*******************************************************
*******************************************************

*get a continious variable indicating receive and check date 
tostring receive_date, gen(rd)		// Date the claim was received 
tostring check_date, gen(cd)		// date the claim was pricessed 
destring rd, gen(rd_num)			// we get a numeric, contiuous value for the receive date 
destring cd, gen(cd_num)			// we get a numeric, contiuous value for the check date  (date claim was processed)
 
 
//encode the procedure code 
encode procedure_cd, gen(proc_code)
//drop procedure_cd
rename proc_code proc1


gen     spec_code = .
replace spec_code = 1 if specialty_group == "Cardiologists"
replace spec_code = 2 if specialty_group == "Internal & Family Medicine"
replace spec_code = 3 if specialty_group == "Obstetricians & Gynecologists"
replace spec_code = 4 if specialty_group == "Orthopedists"
replace spec_code = 5 if specialty_group == "Pediatrics"


*dropping duplicate items of  all claims to identify submission more cleanly 
bys visit_billing_id rd cd amt_paid_line proc1 allwd_amt: gen counter_dup = _N // identify items within a visit which appear to be clear duplicates, i.e. same procedure code, same receive date, same check date, same paid amount, same allowed amount  

// try to drop duplicates with missing reason codes wherever possible so we have the original claim reason code 
replace  rcsa_remit_adj_cd_1 = "ZZZ" if rcsa_remit_adj_cd_1 == "" & counter_dup > 1
drop counter_dup

sort visit_billing_id rd cd amt_paid_line proc1 allwd_amt rcsa_remit_adj_cd_1
by visit_billing_id rd cd amt_paid_line proc1 allwd_amt: gen counter_dup = _n // identify items within a visit which appear to be clear duplicates, i.e. same procedure code, same receive date, same check date, same paid amount, same allowed amount  

drop if counter_dup > 1  // we drop duplicates as defined above (this drops around 14,000,000  observations)
drop counter_dup
replace rcsa_remit_adj_cd_1 = "" if rcsa_remit_adj_cd_1 == "ZZZ"

***********************************************************************
***********************************************************************
********* Determine Claim Characteristics for Final Data **************
***********************************************************************
***********************************************************************

/*
For the following determination of resubmission we assume the following: 
If we see claims being submitted ON THE SAME DAY AND PROCESSED ON THE SAME DAY we treat these as one claim. I.e. we abstract from the claim identifiers 
in the IQVIA data, which are not 100% reliable. 
Then, we assume that if a claim is submitted before the preceeding claim has been processed, it can hardly be a resubmission of the preceeding claim that is due to a denial. 
So we DO NOT treat a claim as a resubmission if it is submitted before the preceeding claim has been processed
*/

*Counting claims by the receive date and check date combination 
bys visit_billing_id rd cd: gen double counter_date = 1 if _n==1	// First item for each receive date/check date combination is identified 
by visit_billing_id: replace counter_date = sum(counter_date)		// summing over these items generates a continuous claim counter 
tostring counter_date, replace 
gen claim_id = 		visit_billing_id + 	counter_date									// new claim identifier based on recevie and check dates
destring claim_id, replace 									
replace claim_id = round(claim_id)								
format claim_id %30.0g
destring counter_date, gen(counter_claim)  									
compress


*gen variable to count the # of procedure code observations in visit 
bys visit_billing_id proc1 claim_id: gen counter_cpt = 1 if _n==1			// identfy the first procedure within a claim 	
bys visit_billing_id proc1: replace counter_cpt = sum(counter_cpt)			// sum to get the continuous number of cpt codes within a vist 
	
**********************************
** MAIN DETERMINATION OF CLAIMS **
**********************************

* check for receive date check date differences (the difference must be weakly positive for a claim to be considered a resubmission)
forval s = 1/7 { 

	gen cd_tmp_`s' = cd_num if counter_claim == `s' 		// generate temp variabel with the receive date corresponding to the claim # 
	gen rd_tmp_`s' = rd_num if counter_claim == `s' 		// generate temp variabel with the check date corresponding to the claim #	

	bys visit_billing_id: gegen cd_`s' = mean(cd_tmp_`s')	// generate variabel with the receive date corresponding to the claim # 
	bys visit_billing_id: gegen rd_`s' = mean(rd_tmp_`s')	// generate variabel with the check date corresponding to the claim #
	
	drop cd_tmp_`s' rd_tmp_`s'

}
	
forval s = 1/6 { 

	local a = `s' + 1
	local b = `s' + 2
	local c = `s' + 3
	local d = `s' + 4
	local e = `s' + 5
	local f = `s' + 6

	if `s' == 1 {
		
		gen date_dif_1_`a' = rd_`a' - cd_`s' // generate the date difference for first claim and second claim
		gen date_dif_1_`b' = rd_`b' - cd_`s' // generate the date difference for first claim and third claim
		gen date_dif_1_`c' = rd_`c' - cd_`s' // generate the date difference for first claim and fourth claim
		gen date_dif_1_`d' = rd_`d' - cd_`s' // generate the date difference for first claim and fifth claim
		gen date_dif_1_`e' = rd_`e' - cd_`s' // generate the date difference for first claim and sixth claim
		gen date_dif_1_`f' = rd_`f' - cd_`s' // generate the date difference for first claim and seventh claim

	}
	
	if `s' == 2 {
		
		gen date_dif_2_`a' = rd_`a' - cd_`s' // generate the date difference for second claim and third claim
		gen date_dif_2_`b' = rd_`b' - cd_`s' // generate the date difference for second claim and fourth claim
		gen date_dif_2_`c' = rd_`c' - cd_`s' // generate the date difference for second claim and fifth claim
		gen date_dif_2_`d' = rd_`d' - cd_`s' // generate the date difference for second claim and sisth claim
		gen date_dif_2_`e' = rd_`e' - cd_`s' // generate the date difference for second claim and seventh claim

	}
	
	if `s' == 3 {
		
		gen date_dif_3_`a' = rd_`a' - cd_`s' // generate the date difference for third claim and fourth claim
		gen date_dif_3_`b' = rd_`b' - cd_`s' // generate the date difference for third claim and fifth claim
		gen date_dif_3_`c' = rd_`c' - cd_`s' // generate the date difference for third claim and sixth claim
		gen date_dif_3_`d' = rd_`d' - cd_`s' // generate the date difference for third claim and seventh claim

	}
	
	if `s' == 4 {
		
		gen date_dif_4_`a' = rd_`a' - cd_`s' // generate the date difference for fourth claim and fifth claim
		gen date_dif_4_`b' = rd_`b' - cd_`s' // generate the date difference for fourth claim and sixth claim
		gen date_dif_4_`c' = rd_`c' - cd_`s' // generate the date difference for fourth claim and seventh claim

	}
	
	if `s' == 5 {
		
		gen date_dif_5_`a' = rd_`a' - cd_`s' // generate the date difference for fifth claim and sixth claim
		gen date_dif_5_`b' = rd_`b' - cd_`s' // generate the date difference for fifth claim and seventh claim

	}
	
	if `s' == 6 {
		
		gen date_dif_6_`a' = rd_`a' - cd_`s' // generate the date difference for sixth claim and seventh claim

	}

}
drop rd_* cd_*
	
*******************************************************
** Determine Resubmissions based on Date Differences **
*******************************************************

gen claim = 1 if counter_claim == 1 						  // the first claim is obviously identified 
replace claim = counter_claim if counter_claim == counter_cpt // assuming that if the consecutive number of claims matches the consecutive number of cpt codes we see, it is a resubmission of that cpt code
	
replace claim = 2 if counter_claim == 2 & date_dif_1_2 >= 0 & claim == .  // if the receive date of the second claim is following the check date of the first claim, its a resubmission and thus the second claim 
replace claim = 3 if counter_claim == 3 & date_dif_2_3 >= 0 & date_dif_1_2 >= 0 & claim == . // its the third claim if all preceeding date differences are weakly positive
replace claim = 4 if counter_claim == 4 & date_dif_3_4 >= 0 & date_dif_2_3 >= 0 & date_dif_1_2 >= 0 & claim == . // its the fourth claim if all preceeding date differences are weakly positive
replace claim = 5 if counter_claim == 5 & date_dif_4_5 >= 0 & date_dif_3_4 >= 0 & date_dif_2_3 >= 0 & date_dif_1_2 >= 0 & claim == . // its the fifth claim if all preceeding date differences are weakly positive
replace claim = 6 if counter_claim == 6 & date_dif_5_6 >= 0 & date_dif_4_5 >= 0 & date_dif_3_4 >= 0 & date_dif_2_3 >= 0 & date_dif_1_2 >= 0 & claim == . // its the sixth claim if all preceeding date differences are weakly positive
replace claim = 7 if counter_claim == 7 & date_dif_6_7 >= 0 & date_dif_5_6 >= 0 & date_dif_4_5 >= 0 & date_dif_3_4 >= 0 & date_dif_2_3 >= 0 & date_dif_1_2 >= 0 & claim == . // its the seventh claim if all preceeding date differences are weakly positive


replace claim = 1 if counter_claim == 2 & date_dif_1_2 < 0 & claim == .  // if the check date of the second claim is preceding the receive date of the first claim, we count this as the first claim 
replace claim = 1 if counter_claim == 3 & date_dif_1_3 < 0 & claim == . // if the check date of the third claim is preceding the receive date of the first claim, we count this as the first claim 
replace claim = 1 if counter_claim == 4 & date_dif_1_4 < 0 & claim == . // if the check date of the fourth claim is preceding the receive date of the first claim, we count this as the first claim 
replace claim = 1 if counter_claim == 5 & date_dif_1_5 < 0 & claim == . // if the check date of the fifth claim is preceding the receive date of the first claim, we count this as the first claim 
replace claim = 1 if counter_claim == 6 & date_dif_1_6 < 0 & claim == . // if the check date of the sixth claim is preceding the receive date of the first claim, we count this as the first claim 
replace claim = 1 if counter_claim == 7 & date_dif_1_7 < 0 & claim == . // if the check date of the seventh claim is preceding the receive date of the first claim, we count this as the first claim 


replace claim = 2 if counter_claim == 3 & date_dif_2_3 < 0 & claim == .  // if the check date of the third claim is preceding the receive date of the second claim, we count this as the second claim (whether it preceeded the first claim was checked above)
replace claim = 2 if counter_claim == 4 & date_dif_2_4 < 0 & claim == . // if the check date of the fourth claim is preceding the receive date of the second claim, we count this as the second claim (whether it preceeded the first claim was checked above)
replace claim = 2 if counter_claim == 5 & date_dif_2_5 < 0 & claim == . // if the check date of the fifth claim is preceding the receive date of the second claim, we count this as the second claim (whether it preceeded the first claim was checked above)
replace claim = 2 if counter_claim == 6 & date_dif_2_6 < 0 & claim == . // if the check date of the sixth claim is preceding the receive date of the second claim, we count this as the second claim (whether it preceeded the first claim was checked above)
replace claim = 2 if counter_claim == 7 & date_dif_2_7 < 0 & claim == . // if the check date of the seventh claim is preceding the receive date of the second claim, we count this as the second claim (whether it preceeded the first claim was checked above)


replace claim = 3 if counter_claim == 4 & date_dif_3_4 < 0 & claim == .  // if the check date of the fourth claim is preceding the receive date of the third claim, we count this as the third claim 
replace claim = 3 if counter_claim == 5 & date_dif_3_5 < 0 & claim == . // if the check date of the fifth claim is preceding the receive date of the third claim, we count this as the third claim 
replace claim = 3 if counter_claim == 6 & date_dif_3_6 < 0 & claim == . // if the check date of the sicth claim is preceding the receive date of the third claim, we count this as the third claim 
replace claim = 3 if counter_claim == 7 & date_dif_3_7 < 0 & claim == . // if the check date of the seventh claim is preceding the receive date of the third claim, we count this as the third claim 

replace claim = 4 if counter_claim == 5 & date_dif_4_5 < 0 & claim == .  // if the check date of the fifth claim is preceding the receive date of the fourth claim, we count this as the fourth claim 
replace claim = 4 if counter_claim == 6 & date_dif_4_6 < 0 & claim == . // if the check date of the sixth claim is preceding the receive date of the fourth claim, we count this as the fourth claim 
replace claim = 4 if counter_claim == 7 & date_dif_4_7 < 0 & claim == . // if the check date of the seventh claim is preceding the receive date of the fourth claim, we count this as the fourth claim 

replace claim = 5 if counter_claim == 6 & date_dif_5_6 < 0 & claim == .  // if the check date of the sixth claim is preceding the receive date of the fifth claim, we count this as the fifth claim 
replace claim = 5 if counter_claim == 7 & date_dif_5_7 < 0 & claim == . // if the check date of the seventh claim is preceding the receive date of the fifth claim, we count this as the fifth claim 

replace claim = 6 if counter_claim == 7 & date_dif_6_7 < 0 & claim == . // if the check date of the seventh claim is preceding the receive date of the sixth claim, we count this as the sixth claim 


replace claim = 2 if date_dif_1_2 < 0 & date_dif_1_3 >= 0 & date_dif_1_3 != . & claim == . // if the second claim is submitted before the first claim is processed and there is a subsequent third claim, it must be the first resubmission 
replace claim = 2 if date_dif_1_2 < 0 & date_dif_1_3 < 0 & date_dif_1_4 >= 0 & date_dif_1_4 != . & claim == . // if the third claim is submitted before the first claim is processed and there is a subsequent fourth claim, it must be the first resubmission 
replace claim = 2 if date_dif_1_2 < 0 & date_dif_1_3 < 0 & date_dif_1_4 < 0 & date_dif_1_5 >= 0 & date_dif_1_5 != . & claim == . // if the fourth claim is submitted before the first claim is processed and there is a subsequent fifth claim, it must be the first resubmission 
replace claim = 2 if date_dif_1_2 < 0 & date_dif_1_3 < 0 & date_dif_1_4 < 0 & date_dif_1_5 < 0 & date_dif_1_6 >= 0 & date_dif_1_6 != . & claim == . // if the fifth claim is submitted before the first claim is processed and there is a subsequent sixth claim, it must be the first resubmission 
replace claim = 2 if date_dif_1_2 < 0 & date_dif_1_3 < 0 & date_dif_1_4 < 0 & date_dif_1_5 < 0 & date_dif_1_6 < 0 & date_dif_1_7 >= 0 & date_dif_1_7 != . & claim == . // if the fifth claim is submitted before the first claim is processed and there is a subsequent sixth claim, it must be the first resubmission 

replace claim = 1 if claim == . // treat claims as first submission if we cannot identify their true submission number (this can still be improved but only contains a tiny fraction of the data. ~1,200 observation in the 1% sample)

drop date_dif_*



**************************************
** Determine Payments of line items **
**************************************


forval s = 0/6 {
	
		local z = `s' + 1
		gen Y_tmp_`s' = .
		replace Y_tmp_`s' = 1 if claim == `z' & Y_X == 1 // item is paid, we enter 1 in the respective claim's period we have 
		replace Y_tmp_`s' = 0 if claim == `z' & Y_X == 0 // item is not paid, we enter 0 in the respective claim's period we have 
		

		bys visit_billing_id proc1: gegen Y_`s' = mean(Y_tmp_`s') // use payment values for all identical items (this anticipates the `reshaping' of the data)
		
		replace Y_`s' = Y_tmp_`s' if Y_`s' > 0 & Y_`s' < 1 // sometimes we see multiple identical items in one claim with heterogenous payments 

	}

drop Y_tmp_*




**********************************************************
** Determine Reason Codes/Allowed Amounts of line items **
**********************************************************

*create reason codes in string variables 
*rename/encode variables 
forval i = 1/5 {
	encode rcsa_remit_adj_cd_`i', gen(reason_`i')
	rename rcsa_remit_adj_cd_`i' reason_`i'_st
}


*determine value and reason code of item
forval s = 0/6 {

	local z = `s' + 1
	gen C_tmp_`s' = .
	gen imputed_tmp_`s' = . 
	
	replace C_tmp_`s' = line_item_value if claim == `z'  // allowed amount of item is generated in corresponding period 
	bys visit_billing_id proc1: gegen C_`s' = mean(C_tmp_`s') // mean allowed amounts for identical items in a visit and corresponding period
	drop C_tmp_`s'
	
	gen reason_tmp_a_`s' = reason_1_st if claim == `z' // primary reason code of item is generated in corresponding period 
	gen reason_tmp_b_`s' = reason_2_st if claim == `z'  // secondary reason code of item is generated in corresponding period 
	gen reason_tmp_c_`s' = reason_3_st if claim == `z' // tertiary reason code of item is generated in corresponding period 
	gen reason_tmp_d_`s' = reason_4_st if claim == `z' // fourth additional reason code of item is generated in corresponding period 
	gen reason_tmp_e_`s' = reason_5_st if claim == `z' // fifth additional reason code of item is generated in corresponding period 

} 



*fill in reason codes for all items in a period 
forval s = 0/6 {
	
	foreach l in a b c d e {
	
		gen reason_`l'_`s' = reason_tmp_`l'_`s'
		sort visit_billing_id proc1
		by visit_billing_id proc1: replace reason_`l'_`s' = reason_tmp_`l'_`s'[_n-1] if reason_`l'_`s' == ""
		by visit_billing_id proc1: replace reason_`l'_`s' = reason_tmp_`l'_`s'[_n-2] if reason_`l'_`s' == "" 
		by visit_billing_id proc1: replace reason_`l'_`s' = reason_tmp_`l'_`s'[_n-3] if reason_`l'_`s' == "" 
		by visit_billing_id proc1: replace reason_`l'_`s' = reason_tmp_`l'_`s'[_n-4] if reason_`l'_`s' == "" 
		by visit_billing_id proc1: replace reason_`l'_`s' = reason_tmp_`l'_`s'[_n-5] if reason_`l'_`s' == "" 
		by visit_billing_id proc1: replace reason_`l'_`s' = reason_tmp_`l'_`s'[_n-6] if reason_`l'_`s' == "" 
		by visit_billing_id proc1: replace reason_`l'_`s' = reason_tmp_`l'_`s'[_n+1] if reason_`l'_`s' == "" 
		by visit_billing_id proc1: replace reason_`l'_`s' = reason_tmp_`l'_`s'[_n+2] if reason_`l'_`s' == "" 
		by visit_billing_id proc1: replace reason_`l'_`s' = reason_tmp_`l'_`s'[_n+3] if reason_`l'_`s' == "" 
		by visit_billing_id proc1: replace reason_`l'_`s' = reason_tmp_`l'_`s'[_n+4] if reason_`l'_`s' == "" 
		by visit_billing_id proc1: replace reason_`l'_`s' = reason_tmp_`l'_`s'[_n+5] if reason_`l'_`s' == "" 
		by visit_billing_id proc1: replace reason_`l'_`s' = reason_tmp_`l'_`s'[_n+6] if reason_`l'_`s' == "" 
	
	}

} 


// keep single observation per claim -- we have brought the data in a "wide" format 
bys visit_billing_id proc1: gen counter = _n 
keep if counter == 1 

// we have indicators for every period now 
drop Y_X

*Ensure variables are missing if item is paid 
forval s = 0/5 {
	
	local z = `s' + 1
	
	replace Y_`s' = 0 if Y_`z' != .
	
	replace line_item_value = C_`s' if Y_`s' == 1 // use value of paid amt if item is paid in subsequent periods 

	forval j = `z'/6 {
		
		replace Y_`j' = . if Y_`s' == 1
		replace C_`j' = . if Y_`s' == 1

	}
} 

//create flag for imputed values (use imputations whenever there is no payment for an item)
gegen paid_indicator = rowmax(Y_?)
gen imputed_value = 1 - paid_indicator
drop paid_indicator

*Determine Resubmission
forval s = 1/6 {

	local z = `s' - 1
	
	gen R_`s' = .
	replace R_`s' = 1 if Y_`s' != .
	replace R_`s' = 0 if Y_`z' == 0 & Y_`s' == .
	replace C_`s' = . if R_`s' == 0 | R_`s' == .

} 


keep idw_provider_id visit_billing_id claim_cntl_nbr idw_patient_id date_of_service state claim_key receive_date check_date  ///
spec_code reason_? bobfixed_payer_name_broad proc1  patient_age charlson dx1_code dx2_code dx3_code line_item_value ///
pay_type Y_* R_* C_*  claim  year reason_?_?  meanallowedMFFS rev_visit payee_state_cd zipcode line_item_value imputed_value

compress


*check all variables are as expected
forval s = 0/6 {

	local z = `s' + 1

	assert Y_`s' == 1 | Y_`s' == 0 | Y_`s' == . 

}


forval s = 0/5 {

	local z = `s' + 1

	assert R_`z' == 0 | R_`z' == 1  if Y_`s' == 0
	assert C_`z' == . if R_`z' == 0


}

replace C_0 = C_0 * (-1) if C_0 < 0 
assert C_0 >= 0 



forvalues x = 0/6 {

	replace reason_a_`x' = "" if Y_`x' == .
	replace reason_b_`x' = "" if Y_`x' == .
	replace reason_c_`x' = "" if Y_`x' == .
	replace reason_d_`x' = "" if Y_`x' == .
	replace reason_e_`x' = "" if Y_`x' == .

}

	
	
	
gen pay_type_st = "Medicare FFS" if pay_type == 1
replace  pay_type_st = "Medicaid FFS" if pay_type == 2
replace pay_type_st = "Medicaid MCO" if pay_type == 3
replace pay_type_st = "Medicare Advantage" if pay_type == 4
replace pay_type_st = "Private" if pay_type == 5 



*merge reason code classification
rename reason_a_0 reason_code
merge m:1 reason_code using `"$rc_input/$classification_file"'
drop if _merge == 2 
drop _merge 
replace code_cat = "unclassified" if code_cat == ""
rename code_cat code_cat_a_0
rename reason_code reason_code_a_0

rename reason_b_0 reason_code
merge m:1 reason_code using `"$rc_input/$classification_file"'
drop if _merge == 2 
drop _merge 
replace code_cat = "unclassified" if code_cat == ""
rename code_cat code_cat_b_0
rename reason_code reason_code_b_0

rename reason_c_0 reason_code
merge m:1 reason_code using `"$rc_input/$classification_file"'
drop if _merge == 2 
drop _merge 
replace code_cat = "unclassified" if code_cat == ""
rename code_cat code_cat_c_0
rename reason_code reason_code_c_0

replace code_cat_a_0 = code_cat_b_0 if code_cat_a_0 == "contractual" & reason_code_b_0 != "" & code_cat_b_0 != "contractual" & code_cat_b_0 != "unclassified" 
replace reason_code_a_0 = reason_code_b_0 if code_cat_a_0 == "contractual" & reason_code_b_0 != "" & code_cat_b_0 != "contractual" & code_cat_b_0 != "unclassified" 

replace code_cat_a_0 = code_cat_c_0 if code_cat_a_0 == "contractual" & reason_code_c_0 != "" & code_cat_c_0 != "contractual" & code_cat_c_0 != "unclassified" 
replace reason_code_a_0 = reason_code_c_0 if code_cat_a_0 == "contractual" & reason_code_c_0 != "" & code_cat_c_0 != "contractual" & code_cat_c_0 != "unclassified" 


replace code_cat_a_0 = code_cat_b_0 if code_cat_a_0 == "unclassified" & reason_code_b_0 != "" & code_cat_b_0 != "unclassified" 
replace reason_code_a_0 = reason_code_b_0 if code_cat_a_0 == "unclassified" & reason_code_b_0 != "" & code_cat_b_0 != "unclassified" 

replace code_cat_a_0 = code_cat_c_0 if code_cat_a_0 == "unclassified" & reason_code_c_0 != "" & code_cat_c_0 != "unclassified" 
replace reason_code_a_0 = reason_code_c_0 if code_cat_a_0 == "unclassified" & reason_code_c_0 != "" & code_cat_c_0 != "unclassified" 






drop if code_cat_a_0 == "unclassified"

bys pay_type code_cat_a_0: gen number = _N 
gen denied_0 = 1 if Y_0 == 0 
replace denied_0 = 0 if Y_0 == 1 
assert denied_0 != .
bys pay_type code_cat_a_0: gegen number_den = sum(denied_0)

gegen denied = rowtotal(Y_?)
replace denied = 1-denied
assert denied == 0 | denied == 1 



gen code_cat_a_0_num = 1 if code_cat_a_0 == "administrative"
replace code_cat_a_0_num = 2 if code_cat_a_0 == "contractual"
replace code_cat_a_0_num = 3 if code_cat_a_0 == "coverage"
replace code_cat_a_0_num = 4 if code_cat_a_0 == "duplicate"
replace code_cat_a_0_num = 5 if code_cat_a_0 == "information"


bys visit_billing_id: egen visit_code_cat = mode(code_cat_a_0_num), min

gen visit_code_cat_st = "administrative" if visit_code_cat == 1
replace visit_code_cat_st = "contractual" if visit_code_cat == 2
replace visit_code_cat_st = "coverage" if visit_code_cat == 3
replace visit_code_cat_st = "duplicate" if visit_code_cat == 4
replace visit_code_cat_st = "information" if visit_code_cat == 5

drop  visit_code_cat 

rename visit_code_cat_st visit_code_cat

keep visit_billing_id visit_code_cat idw_provider_id patient_age Y_? R_? charlson dx?_code line_item_value ///
proc1 state date_of_service idw_patient_id year spec_code meanallowedMFFS imputed_value payee_state_cd zipcode reason_? pay_type reason_code_?_?


compress 

gen pay_type_pooled = pay_type
replace pay_type_pooled = 2 if pay_type_pooled == 3 
replace pay_type_pooled = 1 if pay_type_pooled == 4 
replace pay_type_pooled = 3 if pay_type_pooled == 5
lab var pay_type_pooled "1: Medicare; 2: Medicaid; 3: Commercial"

gen pay_type_pooled_st = "Medicare" if pay_type_pooled == 1 
replace pay_type_pooled_st = "Medicaid" if pay_type_pooled == 2
replace pay_type_pooled_st = "Commercial" if pay_type_pooled == 3  


lab var pay_type_pooled_st "Pay Type in string format"
lab var payee_state_cd "State Identifier (string)"
lab var state "State Identifier (FIPS code)"

lab var R_1 "resubmission indentifier: ==1 if item is resubmitted in first period"
lab var R_2 "resubmission indentifier: ==1 if item is resubmitted in second period"
lab var R_3 "resubmission indentifier: ==1 if item is resubmitted in third period"
lab var R_4 "resubmission indentifier: ==1 if item is resubmitted in fourth period"
lab var R_5 "resubmission indentifier: ==1 if item is resubmitted in fifth period"
lab var R_6 "resubmission indentifier: ==1 if item is resubmitted in sixth period"

lab var Y_0 "payment identifier: == 1 if item is payed after initial submission"
lab var Y_1 "payment identifier: == 1 if item is payed after first resubmission"
lab var Y_2 "payment identifier: == 1 if item is payed after second resubmission"
lab var Y_3 "payment identifier: == 1 if item is payed after third resubmission"
lab var Y_4 "payment identifier: == 1 if item is payed after fourth resubmission"
lab var Y_5 "payment identifier: == 1 if item is payed after fifth resubmission"
lab var Y_6 "payment identifier: == 1 if item is payed after sixth resubmission"

lab var reason_1 "Primary Reason"
lab var reason_2 "Secondary Reason"
lab var reason_3 "Tertiary Reason"
lab var reason_4 "Quaternary Reason"
lab var reason_5 "Quinary Reason"

lab var charlson "Charlson Score"
lab var date_of_service "Date of Service"
lab var dx1_code "Primary Diagnosis code"
lab var dx2_code "Secondary Diagnosis code"
lab var dx3_code "Tertiary Diagnosis code"
lab var visit_billing_id "Visit ID"

lab var idw_provider_id "Physician ID"
lab var imputed_value "flag for imputed values, ==1 of line item value is imuputed"
lab var line_item_value "Value of the line item"
lab var meanallowedMFFS "RVU Proxy"
lab var patient_age "Patient Age"
lab var proc1 "Procedure Code"
lab var idw_patient_id "Patient ID"
lab var spec_code "Phys. spec. 1:Cardiologist;2:Internal&Fmly;3:O&B;4:Ortopedist;5:Pediatrist"
lab var visit_code_cat "Reason code cateogry of visit"
lab var year "Calendar Year"
lab var zipcode "Physician Zipcode"


save `"$output/data/$line_file_temp"', replace 


log close
